
* Session setup: turn off -more- pauses and start from an empty workspace.
set more off
clear all

* Project root folder; the file paths used later in this script are built from $dir.
global dir "/Volumes/Zihao_SSD2/PatentsView"

*** ============================================================================================================
*** Table B12. Validation of textual similarity (5 random patents).
*** Zihao Li. 11/2024
*** ============================================================================================================

* Load the patent-pair sample and keep only the pair identifiers plus the
* omission and sim_score columns.
import delimited $dir/temp/omission_panel5_randsample.csv, clear
keep patent_id cited_patent_id omission sim_score
rename patent_id patent_id_i
rename cited_patent_id patent_id_j

* Attach citing-patent (i) and cited-patent (j) attributes. Only pairs that
* match in both lookups are kept; no _merge variable is left behind.
merge m:1 patent_id_i using $dir/temp/patent_i.dta, keep(match) nogenerate
merge m:1 patent_id_j using $dir/temp/patent_j.dta, keep(match) nogenerate
sort patent_id_i patent_id_j

*** Generate regression variables
* Pairwise indicators: 1 when patent i and patent j share the attribute,
* 0 otherwise. A logical expression in Stata evaluates to exactly 0 or 1.

* same main_cpc_subclass
* NOTE(review): if both subclasses are blank they compare as equal and the
* pair is coded 1. Left unchanged because this is the dependent variable;
* confirm blank subclasses cannot survive the later sample restrictions.
gen same_main_cpc = (main_cpc_subclass_i == main_cpc_subclass_j)

* For the assignee-based indicators a blank value means "unknown", so two
* blanks must not count as a match. Requiring the i-side to be non-blank is
* enough: equality then implies the j-side is non-blank too.
gen same_assignee_country = (assignee_country_i == assignee_country_j & assignee_country_i != "")
gen same_assignee_location = (assignee_location_id_i == assignee_location_id_j & assignee_location_id_i != "")
* Fix: previously two blank assignee IDs were coded as the same assignee;
* the guard now matches the country/location indicators above.
gen same_assignee = (assignee_id_i == assignee_id_j & assignee_id_i != "")

* Difference between the two patents' years (i minus j).
gen years_lag = patent_year_i - patent_year_j

* KPSS commercial value (quality) variable (convert to dollar)
* Presumably xi_real is expressed in millions of dollars -- TODO confirm.
gen xi_dollar_real_i = xi_real_i * 1000000
gen xi_dollar_real_j = xi_real_j * 1000000
* Natural log; zero or negative values become missing.
gen dollar_real_log_i = log(xi_dollar_real_i)
gen dollar_real_log_j = log(xi_dollar_real_j)

* Drop the gender_*_list* and race80_list* variables; the regressions in this
* section do not use them.
drop gender_09_100_list* gender_io_09_100_list* gender_09_50_list* gender_08_100_list* gender_08_50_list* gender_io_05_100_list* gender_io_06_100_list* gender_io_07_100_list* gender_io_08_100_list* race80_list*
* Drop pairs whose similarity score is exactly 1.
drop if sim_score == 1

* Numeric identifiers for the absorbed fixed effects.
* Assignee IDs use egen group() instead of encode: encode stores every
* distinct string in a value label, and a value label holds at most 65,536
* mappings, so encode errors out once the sample has more assignees than
* that. group() has no such cap, still maps blank IDs to missing, and
* defines the same groups, so fixed-effect estimates are unaffected. The
* resulting codes carry no value labels.
egen long assignee_id_i_enc = group(assignee_id_i)
egen long assignee_id_j_enc = group(assignee_id_j)
* CPC sections take only a few distinct values, so encode is safe here.
encode main_cpc_section_i, gen(main_cpc_section_i_enc)
encode main_cpc_section_j, gen(main_cpc_section_j_enc)
* Keep only pairs with a known year for patent i and a known CPC section on
* both sides.
drop if patent_year_i==. | main_cpc_section_i=="" | main_cpc_section_j==""


*** Run Regression
* All four columns regress same_main_cpc on sim_score with standard errors
* clustered on patent_id_i, and are written to the same outreg2 file
* (Col (1) replaces it, the others append).
* After each estimation, -estadd ysumm- (estout package) stores the
* dependent-variable summary statistics in e(); e(ymean) is what addstat()
* reports. outreg2 and reghdfe are user-written commands as well.

* Text fragments shared across columns.
local controls dollar_real_log_j num_citations_i num_inventors_i num_inventors_j avg_experience_i avg_experience_j same_assignee_country years_lag
local fe_no Firm i FE, No, Firm j FE, No, Year i FE, No, Main CPC Section i FE, No, Main CPC Section j FE, No
local fe_yes Firm i FE, Yes, Firm j FE, Yes, Year i FE, Yes, Main CPC Section i FE, Yes, Main CPC Section j FE, Yes
local se_sample Cluster SE, Citing, Sample, All

* Col (1): similarity score only
regress same_main_cpc sim_score, vce(cluster patent_id_i)
estadd ysumm
outreg2 using $dir/reg_results/tableb12.doc, replace dec(4) keep() ///
    addtext(`fe_no', `se_sample') ///
    addstat(Mean of dependent variable, e(ymean))

* Col (2): adds allfemale_09_100_j
regress same_main_cpc sim_score allfemale_09_100_j, vce(cluster patent_id_i)
estadd ysumm
outreg2 using $dir/reg_results/tableb12.doc, append dec(4) keep() ///
    addtext(`fe_no', `se_sample') ///
    addstat(Mean of dependent variable, e(ymean))

* Col (3): adds the control variables
regress same_main_cpc sim_score allfemale_09_100_j `controls', vce(cluster patent_id_i)
estadd ysumm
outreg2 using $dir/reg_results/tableb12.doc, append dec(4) ///
    keep(allfemale_09_100_j sim_score `controls') ///
    addtext(`fe_no', `se_sample') ///
    addstat(Mean of dependent variable, e(ymean))

* Col (4): same regressors as Col (3), absorbing firm i, firm j, year i and
* main CPC section i/j fixed effects
reghdfe same_main_cpc sim_score allfemale_09_100_j `controls', ///
    absorb(assignee_id_i_enc assignee_id_j_enc patent_year_i main_cpc_section_i_enc main_cpc_section_j_enc) ///
    vce(cluster patent_id_i)
estadd ysumm
outreg2 using $dir/reg_results/tableb12.doc, append dec(4) ///
    keep(sim_score allfemale_09_100_j `controls') ///
    addtext(`fe_yes', `se_sample') ///
    addstat(Mean of dependent variable, e(ymean))

